# A comprehensive tool for analyzing and visualizing dataset statistics, including class distribution and bounding box size metrics.

import os
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

def analyze_dataset_statistics(image_dir, label_dir, output_dir='.', save_plots=True):
    """
    Analyze a YOLO-format dataset and report summary statistics.

    The report covers:
    1. Total number of images (counted as the number of ``.txt`` label files)
    2. Class distribution (objects per class)
    3. Bounding box size distribution (normalized area, ``w * h``)
    4. Images per class (number of label files containing each class)

    Args:
        image_dir: Directory containing the images. Accepted for interface
            compatibility; it is not read, because every statistic is derived
            from the label files.
        label_dir: Directory containing YOLO label files, one ``.txt`` per
            image, each line formatted as ``class x y w h``.
        output_dir: Directory the plots are written to. Created if missing.
            Defaults to the current working directory.
        save_plots: When true, write ``class_distribution.png`` and (if any
            boxes were found) ``bbox_size_distribution.png`` to *output_dir*.

    Returns:
        A dict with the keys ``total_images`` (int), ``class_counts``
        (dict of class id -> object count), ``images_per_class`` (dict of
        class id -> number of label files containing it) and ``bbox_sizes``
        (list of normalized areas).

    Raises:
        OSError: If *label_dir* cannot be listed.
    """
    total_images = 0
    class_counts = Counter()
    images_per_class = Counter()
    bbox_sizes = []

    # Sort so the processing order (and thus bbox_sizes) is deterministic;
    # os.listdir returns entries in arbitrary order.
    for label_file in sorted(os.listdir(label_dir)):
        if not label_file.endswith('.txt'):
            continue

        total_images += 1
        label_path = os.path.join(label_dir, label_file)

        try:
            with open(label_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error processing {label_file}: {e}")
            continue

        classes_in_file = set()
        for line_no, line in enumerate(lines, start=1):
            parts = line.split()
            if len(parts) < 5:  # YOLO format: class x y w h
                continue

            # Handle parse errors per line so that one malformed annotation
            # does not discard the remaining valid lines of the same file.
            try:
                class_id = int(parts[0])
                w, h = float(parts[3]), float(parts[4])
            except ValueError as e:
                print(f"Error processing {label_file} (line {line_no}): {e}")
                continue

            class_counts[class_id] += 1
            classes_in_file.add(class_id)
            bbox_sizes.append(w * h)  # Normalized area

        # Each file contributes at most once per class.
        images_per_class.update(classes_in_file)

    # Print statistics
    print("\nDataset Statistics:")
    print(f"Total images: {total_images}")
    print("\nClass distribution:")
    for class_id, count in sorted(class_counts.items()):
        print(f"Class {class_id}: {count} objects")

    print("\nImages per class:")
    for class_id, count in sorted(images_per_class.items()):
        print(f"Class {class_id}: {count} images")

    if bbox_sizes:
        print("\nBounding box size statistics:")
        print(f"Average size: {np.mean(bbox_sizes):.4f}")
        print(f"Median size: {np.median(bbox_sizes):.4f}")
        print(f"Min size: {min(bbox_sizes):.4f}")
        print(f"Max size: {max(bbox_sizes):.4f}")

    if save_plots:
        _save_distribution_plots(class_counts, bbox_sizes, output_dir)

    return {
        'total_images': total_images,
        'class_counts': dict(class_counts),
        'images_per_class': dict(images_per_class),
        'bbox_sizes': bbox_sizes,
    }


def _save_distribution_plots(class_counts, bbox_sizes, output_dir):
    """Write the class-distribution bar chart and bbox-area histogram as PNGs."""
    os.makedirs(output_dir, exist_ok=True)

    # Plot class distribution
    class_ids = sorted(class_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(class_ids, [class_counts[class_id] for class_id in class_ids])
    plt.title('Class Distribution')
    plt.xlabel('Class ID')
    plt.ylabel('Number of Objects')
    plt.savefig(os.path.join(output_dir, 'class_distribution.png'))
    plt.close()

    # Plot bbox size distribution (skipped when no boxes were parsed)
    if bbox_sizes:
        plt.figure(figsize=(10, 6))
        plt.hist(bbox_sizes, bins=50)
        plt.title('Bounding Box Size Distribution')
        plt.xlabel('Normalized Area')
        plt.ylabel('Frequency')
        plt.savefig(os.path.join(output_dir, 'bbox_size_distribution.png'))
        plt.close()

# Paths (the values below appear to be placeholders; set them to the real
# dataset locations before running)
image_dir = r'D:/***'  # Directory containing images
label_dir = r'D:/***'  # Directory containing YOLO format labels

# Run the analysis only when executed as a script, so that importing this
# module to reuse analyze_dataset_statistics has no side effects.
if __name__ == "__main__":
    analyze_dataset_statistics(image_dir, label_dir)
